In [1]:
#Warning ignorance if generated

import warnings
warnings.filterwarnings("ignore")
In [2]:
#import necessary python packages for single-cell RNA SEQ analysis

import scanpy as sc #software suite of tools for single-cell analysis in python
import besca as bc #internal BEDA package for single cell analysis
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy
import anndata as ad
from scipy.sparse import csr_matrix
import scanpy.external as sce
from harmony import harmonize
import umap.umap_ as umap
print(ad.__version__)

sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)

# gives error!! sc.logging.print_versions()
INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmpqtu1q5x0
INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmpqtu1q5x0/_remote_module_non_scriptable.py
INFO:lightning_fabric.utilities.seed:Global seed set to 0
0.9.1
In [3]:
#particular displaying result settings for single cell analysis

sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')
scanpy==1.9.3 anndata==0.9.1 umap==0.3.10 numpy==1.23.5 scipy==1.10.1 pandas==2.0.1 scikit-learn==1.2.2 statsmodels==0.14.0 python-igraph==0.10.4 louvain==0.8.0 pynndescent==0.5.10
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/_settings.py:447: DeprecationWarning:

`set_matplotlib_formats` is deprecated since IPython 7.23, directly use `matplotlib_inline.backend_inline.set_matplotlib_formats()`

In [4]:
#loading all dataset into the workspace

#Load Disease PBMC dataset1 for sarcoidosis  
sarc1=sc.read_10x_mtx('/raid02/Data-live/tjana/LIB5455299_SAM24412250/outs/filtered_feature_bc_matrix', 
                      var_names='gene_symbols',cache=True)

#Load Disease PBMC dataset2 for sarcoidosis  
sarc2=sc.read_10x_mtx('/raid02/Data-live/tjana/LIB5455301_SAM24412252/outs/filtered_feature_bc_matrix', 
                      var_names='gene_symbols',cache=True)

#Load Disease PBMC dataset1 for sarcoidosis  
sarc3=sc.read_10x_mtx('/raid02/Data-live/tjana/LIB5455303_SAM24412254/outs/filtered_feature_bc_matrix', 
                      var_names='gene_symbols',cache=True)

#load Healthy PBMC Control1
healthy1=sc.read_10x_mtx('/home/jana/healthypbmc/healthy1/filtered_feature_bc_matrix', 
                         var_names='gene_symbols',cache=True)

#load Healthy PBMC Control2
healthy2=sc.read_10x_mtx('/home/jana/healthypbmc/healthy2/filtered_feature_bc_matrix', 
                         var_names='gene_symbols',cache=True)

#load Healthy PBMC Control3
healthy3=sc.read_10x_mtx('/home/jana/healthypbmc/healthy4/filtered_feature_bc_matrix', 
                         var_names='gene_symbols',cache=True)
... reading from cache file cache/raid02-Data-live-tjana-LIB5455299_SAM24412250-outs-filtered_feature_bc_matrix-matrix.h5ad
... reading from cache file cache/raid02-Data-live-tjana-LIB5455301_SAM24412252-outs-filtered_feature_bc_matrix-matrix.h5ad
... reading from cache file cache/raid02-Data-live-tjana-LIB5455303_SAM24412254-outs-filtered_feature_bc_matrix-matrix.h5ad
... reading from cache file cache/home-jana-healthypbmc-healthy1-filtered_feature_bc_matrix-matrix.h5ad
... reading from cache file cache/home-jana-healthypbmc-healthy2-filtered_feature_bc_matrix-matrix.h5ad
... reading from cache file cache/home-jana-healthypbmc-healthy4-filtered_feature_bc_matrix-matrix.h5ad
In [5]:
#Making all indexes into unique of all samples (Disease SARCOID and Healthy)

#sarcoid 
sarc1.var_names_make_unique()
sarc2.var_names_make_unique()
sarc3.var_names_make_unique()

#healthy/control
healthy1.var_names_make_unique()
healthy2.var_names_make_unique()
healthy3.var_names_make_unique()
In [6]:
# Adding some metadata for all samples

sarc1.obs['type']="Sarc"
sarc1.obs['sample']="Sarc-1"
sarc2.obs['type']="Sarc"
sarc2.obs['sample']="Sarc-2"
sarc3.obs['type']="Sarc"
sarc3.obs['sample']="Sarc-3"
healthy1.obs['type']="healthy"
healthy1.obs['sample']="healthy-1"
healthy2.obs['type']="healthy"
healthy2.obs['sample']="healthy-2"
healthy3.obs['type']="healthy"
healthy3.obs['sample']="healthy-3"


# All samples are merged into one object named adata (anndata - Annotated data)

adata = sarc1.concatenate(sarc2, sarc3, healthy1, healthy2, healthy3)

# Deleting individual datasets to save space

del(sarc1, sarc2, sarc3)
del(healthy1, healthy2, healthy3)
/home/jana/my-notebook-venv/lib/python3.8/site-packages/anndata/_core/anndata.py:1755: FutureWarning:

The AnnData.concatenate method is deprecated in favour of the anndata.concat function. Please use anndata.concat instead.

See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html

In [16]:
# Displaying merged object observation and variables types

print (adata)
print ("------")

#Displaying counts cells sample wise

print(adata.obs['sample'].value_counts())

#Displaying counts total cells counts of healthy and Sarcoid (Sarc)
print ("------")
print(adata.obs['type'].value_counts())
AnnData object with n_obs × n_vars = 74050 × 36601
    obs: 'type', 'sample', 'batch'
    var: 'gene_ids', 'feature_types'
------
sample
healthy-3    23837
healthy-1    11996
healthy-2    11996
Sarc-2       10029
Sarc-3        8754
Sarc-1        7438
Name: count, dtype: int64
------
type
healthy    47829
Sarc       26221
Name: count, dtype: int64
In [8]:
# Print merged adata var (variable) types
print (adata.var)
Out[8]:
gene_ids feature_types
MIR1302-2HG ENSG00000243485 Gene Expression
FAM138A ENSG00000237613 Gene Expression
OR4F5 ENSG00000186092 Gene Expression
AL627309.1 ENSG00000238009 Gene Expression
AL627309.3 ENSG00000239945 Gene Expression
... ... ...
AC141272.1 ENSG00000277836 Gene Expression
AC023491.2 ENSG00000278633 Gene Expression
AC007325.1 ENSG00000276017 Gene Expression
AC007325.4 ENSG00000278817 Gene Expression
AC007325.2 ENSG00000277196 Gene Expression

36601 rows × 2 columns

In [10]:
# Print merged adata obs observation types
print (adata.obs)
Out[10]:
type sample batch
AAACCCAAGACATAAC-1-0 Sarc Sarc-1 0
AAACCCAAGAGGCGGA-1-0 Sarc Sarc-1 0
AAACCCAAGCGTACAG-1-0 Sarc Sarc-1 0
AAACCCAAGGTACAAT-1-0 Sarc Sarc-1 0
AAACCCACAGCGTACC-1-0 Sarc Sarc-1 0
... ... ... ...
TTTGTTGGTTCAAGGG-1-5 healthy healthy-3 5
TTTGTTGTCACCTGGG-1-5 healthy healthy-3 5
TTTGTTGTCATTGAGC-1-5 healthy healthy-3 5
TTTGTTGTCCGATGTA-1-5 healthy healthy-3 5
TTTGTTGTCGTGGCTG-1-5 healthy healthy-3 5

74050 rows × 3 columns

In [20]:
# Identification of mitochondrial genes by giving a pattern 'MT-'
adata.var['mt'] = adata.var_names.str.startswith('MT-') 

#  Identification ribosomal genes by giving a pattern 'RPS/RPL'
adata.var['ribo'] = adata.var_names.str.startswith(("RPS","RPL"))

#  Identification hemoglobin genes by giving a pattern ^HB[^(P)]

adata.var['hb'] = adata.var_names.str.contains(("^HB[^(P)]"))

# Print merged adata varibale
print ("adata variables types including genes_id, feature types, mt, ribo and hb")
print("----------------")
print (adata.var)
adata variables types including genes_id, feature types, mt, ribo and hb
----------------
                    gene_ids    feature_types     mt   ribo     hb
MIR1302-2HG  ENSG00000243485  Gene Expression  False  False  False
FAM138A      ENSG00000237613  Gene Expression  False  False  False
OR4F5        ENSG00000186092  Gene Expression  False  False  False
AL627309.1   ENSG00000238009  Gene Expression  False  False  False
AL627309.3   ENSG00000239945  Gene Expression  False  False  False
...                      ...              ...    ...    ...    ...
AC141272.1   ENSG00000277836  Gene Expression  False  False  False
AC023491.2   ENSG00000278633  Gene Expression  False  False  False
AC007325.1   ENSG00000276017  Gene Expression  False  False  False
AC007325.4   ENSG00000278817  Gene Expression  False  False  False
AC007325.2   ENSG00000277196  Gene Expression  False  False  False

[36601 rows x 5 columns]
In [25]:
#Calculating the QC metrices of merged object adata
print ("......QC metrices....calculating")

sc.pp.calculate_qc_metrics(adata, qc_vars=['mt','ribo','hb'], percent_top=None, log1p=False, inplace=True)

print ("......QC metrices....calculating done")
......QC metrices....calculating
......QC metrices....calculating done
In [27]:
#Now you can see that we have additional data in the scanpy obs slot.
print("Computing cell compute fraction of counts in mito genes vs. all genes")
mito_genes = adata.var_names.str.startswith('MT-')

# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
adata.obs['percent_mt2'] = np.sum(
    adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1

# Adding the total counts per cell as observations-annotation to adata
adata.obs['n_counts'] = adata.X.sum(axis=1).A1
print ("computing done")
Computing cell compute fraction of counts in mito genes vs. all genes
computing done
In [33]:
#Violin plot of whole QC metrices of merged object in samples wise 
print ("Pre QC metrices")
sc.pl.violin(adata, ['total_counts', 'n_genes_by_counts', 'pct_counts_mt','pct_counts_ribo', 'pct_counts_hb'],
             jitter=0.4, groupby = 'sample', rotation= 45, multi_panel=True)
Pre QC metrices
In [34]:
#Scatter plot: (total counts vs. pct_counts_mt) & (total counts vs  n_genes_by_counts):  sample wise
print ("Scatter plots")
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt', color="sample")
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts', color="sample")
Scatter plots
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_anndata.py:490: MatplotlibDeprecationWarning:

The legendHandles attribute was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use legend_handles instead.

/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_anndata.py:490: MatplotlibDeprecationWarning:

The legendHandles attribute was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use legend_handles instead.

In [36]:
#  Basic Filtering with minimum number of  cells and genes

sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

# Displaying Number of observations and Number of variables/features
print  ("adata contains observations and variables")

print(adata.n_obs, adata.n_vars)
adata contains observations and variables
73171 26137
In [37]:
# Fixing cutoff for mt and ribo for percent mito

print ("Fixing cutoff for mt and ribo for post processing")

adata = adata[adata.obs['pct_counts_mt'] < 20, :]

# filter for percent ribo > 0.05
adata = adata[adata.obs['pct_counts_ribo'] > 5, :]


print("Remaining cells %d"%adata.n_obs)
Fixing cutoff for mt and ribo for post processing
Remaining cells 71184
In [ ]:
#After fitering mito and ribo part, the Violin plot of whole QC metrices of merged object in samples wise
print ("final QC cutoffs for post processing")

sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt','pct_counts_ribo', 'pct_counts_hb'],
             jitter=0.4, groupby = 'sample', rotation = 45)
In [38]:
#top 20 Highest expressing genes 
print ("Highest expressed genes post QC cutoff chosen")

sc.pl.highest_expr_genes(adata, n_top=20)
Highest expressed genes post QC cutoff chosen
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:170: UserWarning:

Received a view of an AnnData. Making a copy.

normalizing counts per cell
    finished (0:00:01)
In [39]:
# we need to redefine the mito_genes,hb_genes, since they were first 
# calculated on the full object before removing low expressed genes.
print ("Removing some unwanted genes named MALAT1")
malat1 = adata.var_names.str.startswith('MALAT1')

mito_genes = adata.var_names.str.startswith('MT-')
hb_genes = adata.var_names.str.contains('^HB[^(P)]')

remove = np.add(mito_genes, malat1)
remove = np.add(remove, hb_genes)
keep = np.invert(remove)

adata = adata[:,keep]

print(adata.n_obs, adata.n_vars)
Removing some unwanted genes named MALAT1
71184 26113
In [40]:
#top 20 Highest expressing genes 
print ("After removing some unwanted genes, current top 20 highest expressed genes")

sc.pl.highest_expr_genes(adata, n_top=20)
After removing some unwanted genes, current top 20 highest expressed genes
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:170: UserWarning:

Received a view of an AnnData. Making a copy.

normalizing counts per cell
    finished (0:00:01)
In [42]:
# adata raw is assigned with adata for post processing
adata.raw = adata
                
In [43]:
#Doublet detetection

print ("Doublet detection")
import scrublet as scr

# split per batch into new objects.
batches = adata.obs['sample'].cat.categories.tolist()
alldata = {}
for batch in batches:
    tmp = adata[adata.obs['sample'] == batch,]
    print(batch, ":", tmp.shape[0], " cells")
    scrub = scr.Scrublet(tmp.raw.X)
    out = scrub.scrub_doublets(verbose=False, n_prin_comps = 20)
    alldata[batch] = pd.DataFrame({'doublet_score':out[0],'predicted_doublets':out[1]},index = tmp.obs.index)
    print(alldata[batch].predicted_doublets.sum(), " predicted_doublets")
    
    
Doublet detection
Sarc-1 : 6854  cells
294  predicted_doublets
Sarc-2 : 9684  cells
628  predicted_doublets
Sarc-3 : 8238  cells
448  predicted_doublets
healthy-1 : 11654  cells
671  predicted_doublets
healthy-2 : 11654  cells
671  predicted_doublets
healthy-3 : 23100  cells
1126  predicted_doublets
In [44]:
# Doublet detector predictions adding to the adata object.

scrub_pred = pd.concat(alldata.values())
adata.obs['doublet_scores'] = scrub_pred['doublet_score'] 
adata.obs['predicted_doublets'] = scrub_pred['predicted_doublets'] 

sum(adata.obs['predicted_doublets'])
Out[44]:
3838
In [45]:
# add in column with singlet/doublet instead of True/Fals
%matplotlib inline

adata.obs['doublet_info'] = adata.obs["predicted_doublets"].astype(str)

sc.pl.violin(adata, 'n_genes_by_counts',
             jitter=0.4, groupby = 'doublet_info', rotation=45)
... storing 'doublet_info' as categorical
In [46]:
#sandard Pipeline used here

print ("Standard pipeline SC RNA seq")

#Normalize the data
sc.pp.normalize_total(adata)

#Logarithmize the data
sc.pp.log1p(adata)

#Identify highly-variable genes
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

#displaying Highly variable genes before normalize and after normalization
sc.pl.highly_variable_genes(adata)

#Getting back to an AnnData of the object in .raw by calling .raw.to_adata().
adata.raw = adata

#filtering highly variable genes 
adata = adata[:, adata.var.highly_variable]


#Regressing out effects of total counts per cell and the percentage of mitochondrial genes expressed
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])

#Scale the data to unit variance and Clip values exceeding standard deviation 10
sc.pp.scale(adata, max_value=10)

#Reduce the dimensionality of the data by running principal component analysis (PCA). Denoising the data
sc.tl.pca(adata, svd_solver='arpack')

#scatter plot generation in the PCA coordinates
sc.pl.pca(adata, color='CST3')

#PCA total variance data

from numba import njit
sc.pl.pca_variance_ratio(adata, log=True)
Standard pipeline SC RNA seq
normalizing counts per cell
    finished (0:00:01)
extracting highly variable genes
    finished (0:00:06)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
regressing out ['total_counts', 'pct_counts_mt']
    sparse input is densified and may lead to high memory use
    finished (0:03:53)
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:13)
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning:

The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.

In [47]:
#Computing the neighborhood graph
#Neighborhood graph of cells using the PCA representation of the data matrix computation.

print ("Computing neighborhood graphs and Clustering")
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=50)

#Clustering the neighborhood graph using Leiden Clustering algorithm
sc.tl.leiden(adata)
sc.tl.umap(adata)

#PLoting UMAP with clusters using Leiden algorithm 
sc.pl.umap(adata, color=['leiden', 'CST3', 'NKG7'])
Computing neighborhood graphs and Clustering
computing neighbors
    using 'X_pca' with n_pcs = 50
/home/jana/my-notebook-venv/lib/python3.8/site-packages/numba/core/typed_passes.py:334: NumbaPerformanceWarning:


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.

File "my-notebook-venv/lib/python3.8/site-packages/umap/rp_tree.py", line 135:
@numba.njit(fastmath=True, nogil=True, parallel=True)
def euclidean_random_projection_split(data, indices, rng_state):
^


/home/jana/my-notebook-venv/lib/python3.8/site-packages/umap/nndescent.py:91: NumbaPerformanceWarning:


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.

File "my-notebook-venv/lib/python3.8/site-packages/umap/utils.py", line 409:
@numba.njit(parallel=True)
def build_candidates(current_graph, n_vertices, n_neighbors, max_candidates, rng_state):
^


/home/jana/my-notebook-venv/lib/python3.8/site-packages/numba/core/typed_passes.py:334: NumbaPerformanceWarning:


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.

File "my-notebook-venv/lib/python3.8/site-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^


    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:34)
running Leiden clustering
    finished: found 37 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:11)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:01:31)
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning:

The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.

/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning:

No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored

In [48]:
#Displaying Doublet scores, Doublet info and Sample wise distrubtion 
print ("Doublet finding plots with scores and info across the samples")

sc.pl.umap(adata, color=['doublet_scores','doublet_info','sample'])
Doublet finding plots with scores and info across the samples
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning:

The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.

/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning:

No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored

/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning:

No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored

In [49]:
#Doublet removing and Rest samples for post processing
print ("Considering the False Doublet finding information, means we are considering non doublet portion for the rest of tha analysis")
# also revert back to the raw counts as the main matrix in adata
adata = adata.raw.to_adata() 

adata = adata[adata.obs['doublet_info'] == 'False',:]

print ("Current shape of the data")
print(adata.shape)
Considering the False Doublet finding information, means we are considering singlet portion for the rest of tha analysis
Current shape of the data
(67346, 26113)
In [51]:
#umap of the current data after the doublet removal

sc.pl.umap(adata, color=['sample','leiden'])
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning:

The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.

/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning:

No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored

/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning:

No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored

In [53]:
#BatchCorrection is done using Harmony
print ("Batch correction using Harmony algorithm")
sce.pp.harmony_integrate(adata, 'sample')
2023-11-08 14:05:27,443 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
INFO:harmonypy:Computing initial centroids with sklearn.KMeans...
Batch correction using Harmony algorithm
2023-11-08 14:06:03,324 - harmonypy - INFO - sklearn.KMeans initialization complete.
INFO:harmonypy:sklearn.KMeans initialization complete.
2023-11-08 14:06:04,121 - harmonypy - INFO - Iteration 1 of 10
INFO:harmonypy:Iteration 1 of 10
2023-11-08 14:06:51,663 - harmonypy - INFO - Iteration 2 of 10
INFO:harmonypy:Iteration 2 of 10
2023-11-08 14:07:37,956 - harmonypy - INFO - Iteration 3 of 10
INFO:harmonypy:Iteration 3 of 10
2023-11-08 14:08:25,802 - harmonypy - INFO - Iteration 4 of 10
INFO:harmonypy:Iteration 4 of 10
2023-11-08 14:09:12,448 - harmonypy - INFO - Converged after 4 iterations
INFO:harmonypy:Converged after 4 iterations
In [54]:
#Post proccssing of Harmonization
print ("Post proccssing of Harmonization")
#current PCA is aligned to Harmonized PCA values 
adata.obsm['X_pca'] = adata.obsm['X_pca_harmony']

#Computing neighborhood graphs and Clustering
print ("Computing neighborhood graphs and Clustering")
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=50)

#Clustering the neighborhood graph using Leiden Clustering algorithm
sc.tl.leiden(adata)
sc.tl.umap(adata)
Post proccssing of Harmonization
Computing neighborhood graphs and Clustering
computing neighbors
    using 'X_pca' with n_pcs = 50
/home/jana/my-notebook-venv/lib/python3.8/site-packages/numba/core/typed_passes.py:334: NumbaPerformanceWarning:


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.

File "my-notebook-venv/lib/python3.8/site-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^


    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:21)
running Leiden clustering
    finished: found 25 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:13)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:01:27)
In [55]:
#UMAP cluster view Sample wise and cluster wise
sc.pl.umap(adata, color=['sample', 'leiden'])
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning:

The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.

/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning:

No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored

/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning:

No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored

In [56]:
#display anndata
adata
Out[56]:
AnnData object with n_obs × n_vars = 67346 × 26113
    obs: 'type', 'sample', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2', 'n_counts', 'n_genes', 'doublet_scores', 'predicted_doublets', 'doublet_info', 'leiden'
    var: 'gene_ids', 'feature_types', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'sample_colors', 'doublet_info_colors', 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'umap', 'leiden_colors'
    obsm: 'X_pca', 'X_umap', 'X_pca_harmony'
    obsp: 'distances', 'connectivities'
In [57]:
#import scipy io package
from scipy import io

save_file = '/home/jana/scanpy_qc_filtered_pbmcs_for_sarcoid.h5ad'
adata.write_h5ad(save_file)